{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Cosine Similarity\n",
    "### Law and Courts Newsletter\n",
    "### code by Rachael Hinkle with guest appearances by Morgan Hazelton\n",
    "### June 24, 2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "##Those who need to install NLTK should refer to: https://www.nltk.org/install.html\n",
    "##Those running NLTK’s stopwords the first time will have to run `import nltk` and then `nltk.download('stopwords')`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Import the libraries you will need for this session (analogous to R packages)\n",
    "\n",
    "# 'os' provides useful functions for dealing with the operating system\n",
    "# 're' provides the ability to use regular expressions\n",
    "# 'csv' provides the ability to write the results to a .csv file\n",
    "# 'nltk' provides many tools from the Natural Language Toolkit\n",
    "\n",
    "import os, re, csv\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import TreebankWordTokenizer\n",
    "\n",
    "# Import a list of English stop words\n",
    "english_stops = set(stopwords.words('english'))\n",
    "\n",
    "# Import a tokenizer that will break up text into parts\n",
    "tokenizer = TreebankWordTokenizer()\n",
    "\n",
    "# Import some stuff to do the fancy math to get cosine scores\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "### The only difference in this code for Windows is the way filepaths are set up. They use a different file separator.\n",
    "## Macs use a single forward slash (\"/\") and Windows uses a back slash (written \"\\\\\" in Python).\n",
    "# os.sep holds the separator for the operating system this notebook is running on,\n",
    "# so the same line works on both Macs and Windows without any editing.\n",
    "fileSeperator = os.sep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Store the working directory, ending with the file separator, as a shortcut for building filepaths\n",
    "# Python tip: an f-string places the value of each {expression} into the text.\n",
    "# The variable name is arbitrary; any valid name would work.\n",
    "mydir = f\"{os.getcwd()}{fileSeperator}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "### How to read in each document from a .txt file in a subfolder\n",
    "#   and output the data to a spreadsheet with one row for every combination between\n",
    "#   two documents with the cosine similarity between them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['PenRegister_1208.531.F.2d.809.txt', 'PenRegister_1378.555.F.2d.254.txt', 'DNAsample_17063.490.F.3d.1178.txt', 'DNAsample_7200.504.F.3d.1.txt', 'DNAsample_17257.483.F.3d.73.txt', 'PenRegister_1310.546.F.2d.243.txt']\n"
     ]
    }
   ],
   "source": [
    "## Prepare information for where to read text files from\n",
    "\n",
    "# Create a filepath to the folder where all text files are located\n",
    "# This folder includes three briefs regarding a DNA sample issue in a search and seizure case\n",
    "# The remaining three briefs are related to a phone tapping issue (a pen register)\n",
    "# The document titles identify to which issue the brief pertains\n",
    "# We anticipate and find that the briefs dealing with the same type of issue are more similar\n",
    "# os.path.join adds the correct file separator for the operating system; the empty\n",
    "# last piece makes the path end with a separator so file names can be added directly\n",
    "dirname = os.path.join(mydir, \"newCosineSampleFiles\", \"\")\n",
    "# Create a list of all .txt files in the folder (this skips stray files such as .DS_Store)\n",
    "# Sorting the list keeps the document order the same every time the code is run\n",
    "dirlist = sorted(name for name in os.listdir(dirname) if name.lower().endswith(\".txt\"))\n",
    "# See the list of all files to make sure things are working\n",
    "print(dirlist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PenRegister_1208.531.F.2d.809.txt\n",
      "PenRegister_1378.555.F.2d.254.txt\n",
      "DNAsample_17063.490.F.3d.1178.txt\n",
      "DNAsample_7200.504.F.3d.1.txt\n",
      "DNAsample_17257.483.F.3d.73.txt\n",
      "PenRegister_1310.546.F.2d.243.txt\n"
     ]
    }
   ],
   "source": [
    "## Read in each document, process text, and create Python objects in appropriate format\n",
    "\n",
    "opinions = []\n",
    "opinionIDs = []\n",
    "\n",
    "fileIDs = []\n",
    "\n",
    "for entry in dirlist:\n",
    "\n",
    "    # Read every line of the document; the 'with' block closes the file\n",
    "    # as soon as the lines have been read\n",
    "    with open(dirname + entry, encoding='latin-1') as infilehandle:\n",
    "        txtlines = infilehandle.readlines()\n",
    "\n",
    "    # This is useful for watching the progress of the code, especially with many documents\n",
    "    print(entry)\n",
    "\n",
    "    # Create empty variables (each time through the loop it is important to make sure\n",
    "    # all variables are empty so values from the previous document do not carry over)\n",
    "    opin_string = \"\"\n",
    "    op_line = False\n",
    "\n",
    "    # Loop through each line in the document\n",
    "    for txtline in txtlines:\n",
    "\n",
    "        # Find the beginning of the opinion: a line containing only the word OPINION\n",
    "        # Python tip: the r in front of the quotes makes a 'raw' string, so the\n",
    "        # backslashes are passed to the regular expression exactly as written\n",
    "        if re.search(r\"^[^A-Za-z]*OPINION[\\s]*$\", txtline):\n",
    "            op_line = True\n",
    "        # Find the end of the opinion: stop collecting text once a dissent,\n",
    "        # concurrence, or appendix begins (op_line must be switched off here,\n",
    "        # otherwise that text would be added to the majority opinion)\n",
    "        if op_line and re.search(r\"^[\\s]*DISSENT\", txtline):\n",
    "            op_line = False\n",
    "        if op_line and re.search(r\"^[\\s]*CONCUR\", txtline):\n",
    "            op_line = False\n",
    "        if op_line and re.search(r\"^[^A-Za-z]*APPENDI\", txtline):\n",
    "            op_line = False\n",
    "\n",
    "        # Add the line to the string holding the majority opinion\n",
    "        if op_line:\n",
    "            opin_string = opin_string + txtline\n",
    "\n",
    "    ## Pre-Process text in opinion string\n",
    "    # Convert to lower case\n",
    "    opin_string = opin_string.lower()\n",
    "    # Turn newlines and returns into spaces\n",
    "    opin_string = re.sub(\"\\r|\\n\", \" \", opin_string)\n",
    "    # Get rid of apostrophes inside of words\n",
    "    opin_string = re.sub(\"'+\", \"\", opin_string)\n",
    "    # Get rid of excess quotation marks\n",
    "    opin_string = re.sub('\"+', '', opin_string)\n",
    "    # Get rid of hyphens between letters; the lookbehind (?<=...) and lookahead (?=...)\n",
    "    # check the neighboring letters without using them up, so every hyphen in a word\n",
    "    # such as mother-in-law is removed\n",
    "    opin_string = re.sub(r\"(?<=[a-z])-(?=[a-z])\", \"\", opin_string)\n",
    "\n",
    "    # Turn one giant string into a list of all tokens\n",
    "    tokens = tokenizer.tokenize(opin_string)\n",
    "\n",
    "    # Keep a token only if it is longer than 2 characters, consists of letters and\n",
    "    # nothing else (this is why we got rid of punctuation inside of words),\n",
    "    # and is not a stopword\n",
    "    kept_tokens = [token for token in tokens\n",
    "                   if len(token) > 2\n",
    "                   and re.search(\"^[a-z]+$\", token)\n",
    "                   and token not in english_stops]\n",
    "\n",
    "    # Rebuild the document as one string with a space in front of every token\n",
    "    opinion = \"\".join(\" \" + token for token in kept_tokens)\n",
    "\n",
    "    opinions.append(opinion)\n",
    "    opinionIDs.append(entry)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' opinion christensen senior district suppressed parte proceeding united states district court northern district illinois eastern division department justice applied granted september order authorized bureau alcohol tobacco firearms department treasury install pen register device particular telephone order also affirmatively required appellant illinois bell telephone company telephone company provide facilities information technical assistance necessary accomplish interception unobtrusively compe'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# View the first 500 characters of the first processed opinion\n",
    "opinions[0][:500]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Doc 1: PenRegister_1208.531.F.2d.809.txt  Doc2: PenRegister_1378.555.F.2d.254.txt   Cosine: 0.613\n",
      "Doc 1: PenRegister_1208.531.F.2d.809.txt  Doc2: DNAsample_17063.490.F.3d.1178.txt   Cosine: 0.142\n",
      "Doc 1: PenRegister_1208.531.F.2d.809.txt  Doc2: DNAsample_7200.504.F.3d.1.txt   Cosine: 0.186\n",
      "Doc 1: PenRegister_1208.531.F.2d.809.txt  Doc2: DNAsample_17257.483.F.3d.73.txt   Cosine: 0.158\n",
      "Doc 1: PenRegister_1208.531.F.2d.809.txt  Doc2: PenRegister_1310.546.F.2d.243.txt   Cosine: 0.829\n",
      "Doc 1: PenRegister_1378.555.F.2d.254.txt  Doc2: DNAsample_17063.490.F.3d.1178.txt   Cosine: 0.107\n",
      "Doc 1: PenRegister_1378.555.F.2d.254.txt  Doc2: DNAsample_7200.504.F.3d.1.txt   Cosine: 0.152\n",
      "Doc 1: PenRegister_1378.555.F.2d.254.txt  Doc2: DNAsample_17257.483.F.3d.73.txt   Cosine: 0.129\n",
      "Doc 1: PenRegister_1378.555.F.2d.254.txt  Doc2: PenRegister_1310.546.F.2d.243.txt   Cosine: 0.6\n",
      "Doc 1: DNAsample_17063.490.F.3d.1178.txt  Doc2: DNAsample_7200.504.F.3d.1.txt   Cosine: 0.719\n",
      "Doc 1: DNAsample_17063.490.F.3d.1178.txt  Doc2: DNAsample_17257.483.F.3d.73.txt   Cosine: 0.771\n",
      "Doc 1: DNAsample_17063.490.F.3d.1178.txt  Doc2: PenRegister_1310.546.F.2d.243.txt   Cosine: 0.139\n",
      "Doc 1: DNAsample_7200.504.F.3d.1.txt  Doc2: DNAsample_17257.483.F.3d.73.txt   Cosine: 0.771\n",
      "Doc 1: DNAsample_7200.504.F.3d.1.txt  Doc2: PenRegister_1310.546.F.2d.243.txt   Cosine: 0.183\n",
      "Doc 1: DNAsample_17257.483.F.3d.73.txt  Doc2: PenRegister_1310.546.F.2d.243.txt   Cosine: 0.153\n"
     ]
    }
   ],
   "source": [
    "## Calculate cosine similarity scores\n",
    "train_set = opinions\n",
    "\n",
    "tfidf_vectorizer = TfidfVectorizer()\n",
    "tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)\n",
    "\n",
    "# Calculate the similarity between every pair of documents in a single call.\n",
    "# The result is a square table: entry [row, col] is the cosine similarity\n",
    "# between document number row and document number col.\n",
    "cosine_matrix = cosine_similarity(tfidf_matrix_train)\n",
    "\n",
    "## Create and open a .csv file that we will write information into\n",
    "# The 'with' block closes the .csv file automatically, even if an error occurs part way through\n",
    "with open(mydir + \"myCosineSpreadsheet.csv\", \"w\", newline=\"\") as fout:\n",
    "    outfilehandle = csv.writer(fout,\n",
    "                               delimiter=\",\",\n",
    "                               quotechar='\"',\n",
    "                               quoting=csv.QUOTE_NONNUMERIC)\n",
    "\n",
    "    # Write the variable names to the first row of the .csv file\n",
    "    outfilehandle.writerow([\"opinion1\", \"opinion2\", \"cosine\"])\n",
    "\n",
    "    for row in range(len(opinions)):\n",
    "\n",
    "        # Only visit columns after the current row so each pair is used once (instead of twice)\n",
    "        for col in range(row + 1, len(opinions)):\n",
    "            # Pull out the single number for this pair and convert it to a plain Python float\n",
    "            cosine = float(cosine_matrix[row, col])\n",
    "\n",
    "            # Present results in a tidy fashion\n",
    "            print(\"Doc 1: \" + opinionIDs[row] + \"  Doc2: \" + opinionIDs[col] + \"   Cosine: \" + str(round(cosine, 3)))\n",
    "\n",
    "            # Write a row to the spreadsheet for each pairwise combo of opinions\n",
    "            outfilehandle.writerow([opinionIDs[row], opinionIDs[col], cosine])\n",
    "\n",
    "# Close the last input document that was opened while reading the files\n",
    "# (closing a file that is already closed does nothing)\n",
    "infilehandle.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
